# load WB data, clean it, and then estimate idealstan model

options(contrasts=c("contr.treatment","contr.treatment"))


library(tidyverse)
library(haven)
# to install, first install package remotes
# then do remotes::install_github("saudiwin/idealstan")
library(idealstan)

# note that cmdstanr must also be installed separately, see
# https://mc-stan.org/cmdstanr/

library(cmdstanr)

# use a slightly older version for compatibility
# use: cmdstanr::install_cmdstan(dir=".", version="2.32.1",overwrite=T)

set_cmdstan_path("cmdstan-2.32.1")

set.seed(2722137)

# location where all data are saved (should be set to folder containing the files from data verse)

save_loc <- "~/wbsurvey/Replication Package 2/In"

# environmental variable for tracking purposes

run <- "replication"

# for reproducibilty

seed <- 723615

set.seed(seed)

# whether to run imputation algorithm on ideal point covariates or load 
# from saved object (takes about 1 hr)

impute <- F

# raw WB survey data

wb_data <- read_dta(paste0(save_loc,"/combined_data.dta")) %>% 
  filter(!(is.na(BMb5) & wbcode=="MAR"))

# sector definitions

new_ind <- read_dta(paste0(save_loc, "/idstd_Eurostat_agg_sector.dta")) %>% 
  mutate(agg_sector=as_factor(agg_sector))

# refusal rates

extra_cov <- read_dta(paste0(save_loc, "/idstd_wRefusalRates.dta"))

# country-level covariates

country_cov <- read_dta(paste0(save_loc, "/idstd_wCtyCovariates.dta"))

# more country-level covariates

ccov2 <- read_dta(paste0(save_loc, "/idstd_wVDEM_GDPpc.dta")) %>% 
  select(idstd,wbcode,country,year,v2x_polyarchy,realGDPpc)

# create a smaller firm survey data set from the full data set

wb_small <- select(wb_data,size_num,idstd,wbcode,countryx,country,obs,year,region,
                   region_old,sector_3,d1a2,ownership,exporter,size,wt,
                   perf1, perf2, tax_visit="reg2",bribe_depth="graft2",
                   bribe_common="graft3",years_unreg="infor5",gender="gend2",
                   int_funds="fin6",bank_funds="fin7",bank_cred="fin14",
                   elec_out="in2",
                   a3a,a3ax,a2,a2x,strata,input_bribes1="graft3",
                   input_bribes2="corr4",
                   input_govtown="car4",
                   firm_age="car1",
                   man_exp="wk8",
                   quality_cert="t1",
                   conglomerate="a7",
                   input_size="size_num",
                   input_pc="pclead",
                   input_sector="isic",
                   input_assn="BMb6",
                   input_lobby="BMb8d",
                   dir_timetax="j2",
                   dir_customsobst="d30b",
                   dir_courtsobst="h30",
                   dir_taxrobst="j30a",
                   dir_poliobst="j30e",
                   dir_corrobst="j30f",
                   dir_laborobst="l30a",
                   sel_elect="c3",
                   delay_elect="c4",
                   bribe_elect="c5",
                   sel_h20="c12",
                   delay_h20="c13",
                   bribe_h20="c15",
                   sel_constr="g2",
                   delay_constr="g3",
                   bribe_constr="j5",
                   sel_import="j10",
                   delay_license="j11",
                   bribe_license="j12",
                   sel_oper="j13",
                   delay_oper="j14",
                   bribe_oper="j15",
                   delay_Xcustoms="d4",
                   #bribe_Xcustoms="d5", completely missing
                   delay_Mcustoms="d14",
                   bribe_Mcustoms="d15a") %>% 
  mutate(id=1) %>% 
  spread(key="input_sector",value="id",fill=0) %>% 
  select(-`<NA>`)

# merge extra covariates

wb_small <- left_join(wb_small,select(extra_cov,
                                      country,
                                      idstd,
                                      refusalrate,
                                      polconiii,
                                      polconv,
                                      j,
                                      legfralower,
                                      executiveelectedby,
                                      partycountlower,
                                      tsl), by=c("country","idstd")) %>% 
  group_by(wbcode) %>% 
  fill(refusalrate:tsl,.direction="downup") %>% 
  ungroup %>% 
  mutate(tsl=scale(tsl),
         partycountlower=scale(as.numeric(partycountlower)))

wb_small <- left_join(wb_small,select(country_cov,idstd,powershift_exec),by=c("idstd"))

wb_small <- left_join(wb_small,select(ccov2,v2x_polyarchy,idstd,realGDPpc), by="idstd")

wb_small <- left_join(wb_small,new_ind,by="idstd")

# need to recode

wb_small <- wb_small %>% 
  mutate_if(is.character,as.factor) %>% 
  mutate_if(is.labelled,as_factor) %>% 
  mutate_if(is.factor,~fct_recode(.,NULL="Don't Know (Spontaneous)",
                                  NULL="-8",
                                  NULL="-9",
                                  NULL="Service  not offered  (spontaneous)",
                                  NULL="Refusal (Spontaneous)",
                                  NULL="-16",
                                  NULL="-7",
                                  NULL="Skipped",
                                  NULL="Does Not Apply",
                                  NULL="Still In Process",
                                  NULL="Not Provided",
                                  NULL="Application Denied",
                                  `1`="One day or less",
                                  `0`="One minute or less",
                                  `0`="No time was spent",
                                  NULL="Never regiestered (spontaneous)")) %>% 
  mutate_at(c("dir_laborobst",
              "dir_corrobst",
              "dir_poliobst",
              "dir_taxrobst",
              "dir_courtsobst",
              "dir_customsobst"),
            ~ordered(.,levels=c("Very severe obstacle",
                                "Major obstacle",
                                "Moderate obstacle",
                                "Minor obstacle",
                                "No obstacle"))) %>% 
  mutate_at(c("input_pc",
              "input_assn",
              "sel_elect",
              "bribe_elect",
              "sel_h20",
              "delay_h20",
              "bribe_h20",
              "sel_constr",
              "bribe_constr",
              "sel_import",
              "bribe_license",
              "sel_oper",
              "bribe_oper",
              "bribe_Mcustoms"),
            ~factor(.,levels=c("No","Yes"))) %>% 
  mutate(input_lobby=ordered(input_lobby,levels=c("Not at all useful",
                                                  "Not very useful",
                                                  "Somewhat useful",
                                                  "Very useful")),
         input_size=ifelse(input_size>100000,NA,input_size)) %>% 
  mutate_at(vars(`10`:`95`),factor,levels=c(0,1))

saveRDS(wb_small, paste0(save_loc, "/wb_small.rds"))

# remove any surveys that don't have political connections

miss_country <- group_by(wb_small,countryx) %>% 
  summarize(n_miss=sum(is.na(input_pc))/n()) %>% 
  filter(n_miss==1)

wb_small <- anti_join(wb_small,miss_country,by="countryx")

saveRDS(wb_small, paste0(save_loc, "/wb_small_nomiss.rds"))

# plot countries

sum_connections <- group_by(wb_data,country) %>% 
  summarize(n_connected=mean(BMb5[BMb5>0]==1),
            n_missing=mean(BMb5<0))

sum_connections <- group_by(wb_data,country) %>% 
  summarize(n_connected=mean(BMb5[BMb5>0]==1),
            n_missing=mean(BMb5<0))

# make a covariate matrix and fill in missing values with random forests

if(impute) {
  
  wb2 <- wb_small
  names(wb2)[67:118] <- paste0("sect_",names(wb2)[67:118])
  
  cov_matrix <- wb2 %>% 
    mutate(input_bribes2=factor(input_bribes2),
           input_bribes2=factor(input_bribes2),
           tsl=c(tsl),
           partycountlower=c(partycountlower)) %>% 
    select(-matches("country"),-region_old,-sector_3,-d1a2,-wt,-idstd) %>% 
    missRanger::missRanger(pmm.k = 10,
                           verbose=2) 
  
  cov_matrix$idstd <- wb2$idstd
  
  cov_matrix <- cov_matrix %>% 
    mutate(dir_timetax=as.numeric(as.character(dir_timetax))) %>% 
    select(idstd, exporter, ownership, size,
           dir_timetax, dir_customsobst, dir_courtsobst, 
           dir_taxrobst, dir_poliobst, dir_corrobst, dir_laborobst,
           executiveelectedby,firm_age,
           polconiii, j, legfralower, refusalrate, partycountlower,tsl,
           sel_h20, sel_elect,sel_constr,sel_oper,sel_import,agg_sector,
           powershift_exec,realGDPpc,man_exp,quality_cert,conglomerate,
           v2x_polyarchy)
  
  saveRDS(cov_matrix, paste0(save_loc, "/cov_matrix.rds"))
  
  
} else {
  
  cov_matrix <- readRDS( paste0(save_loc, "/cov_matrix.rds"))
}


# make the data for idealstan

to_idealstan <- select(wb_small,-`countryx`,-`country.x`,`country.y`,
                       -obs,-year,-region,-region_old,-wt,-ownership,-exporter,
                       -size,-a3a,-a3ax,-a2,-a2x,-strata,-d1a2,
                       -delay_h20,-firm_age,
                       -polconiii,-polconv,-j,-legfralower,
                       -executiveelectedby,-partycountlower,-quality_cert,-conglomerate,
                       -tsl,-refusalrate,-agg_sector,-powershift_exec,-realGDPpc,-man_exp,
                       -v2x_polyarchy) %>% 
  mutate(wbcode_sector=paste0(wbcode,"_",sector_3)) %>% 
  mutate_at(c("input_pc","input_bribes1","input_bribes2","input_assn"),
            ~as.numeric(factor(.)) - 1) %>% 
  mutate(input_lobby=as.numeric(factor(input_lobby))) %>% 
  select(-sector_3,-wbcode_sector) %>% 
  gather(key="item",value="outcome_disc",-idstd,-wbcode) 

to_idealstan <- mutate(ungroup(to_idealstan),outcome_cont=ifelse(item %in% c("input_size",
                                                                             "delay_constr",
                                                                             "delay_elect",
                                                                             "delay_oper",
                                                                             "input_govtown",
                                                                             "dir_timetax",
                                                                             "delay_Mcustoms",
                                                                             "delay_license",
                                                                             "delay_Xcustoms"),
                                                                 as.numeric(as.character(outcome_disc)),
                                                                 -1),
                       outcome_disc = ifelse(outcome_cont == -1,
                                             outcome_disc,
                                             0),
                       model_id=case_when(item %in% c("input_size",
                                                      "delay_constr",
                                                      "delay_elect",
                                                      "delay_oper",
                                                      "input_govtown",
                                                      "dir_timetax",
                                                      "delay_Mcustoms",
                                                      "delay_license",
                                                      "delay_Xcustoms") ~ 10,
                                          item %in% c("input_pc","input_bribes1",
                                                      "input_bribes2","input_assn")~2,
                                          item %in% as.character(1:100)~1,
                                          item %in% c("dir_corrobst",
                                                      "dir_courtsobst",
                                                      "dir_customsobst",
                                                      "dir_laborobst",
                                                      "dir_poliobst",
                                                      "dir_taxrobst",
                                                      "input_lobby")~ 4,
                                          TRUE ~ 2),
                       outcome_disc=as.integer(outcome_disc),
                       ordered_id=ifelse(item=="input_lobby",4,0)) %>% 
  group_by(item) %>% 
  mutate(outcome_cont=ifelse(model_id %in% c(10,9),as.numeric(scale(outcome_cont,
                                                                    scale=2*sd(outcome_cont,na.rm=T))),
                             outcome_cont))

# get rid of any items that only have two different values across countries

get_dist <- to_idealstan %>% 
  filter(model_id %in% c(9,10)) %>% 
  group_by(item) %>% summarize(n_un=length(unique(outcome_cont)))

to_idealstan <- anti_join(to_idealstan,filter( get_dist, n_un<3),
                          by="item")


to_idealstan <-  to_idealstan %>% 
  left_join(cov_matrix,
            by=c("idstd")) %>% 
  mutate(realGDPpc=scale(realGDPpc)) %>% 
  filter(grepl(x=item,pattern="input"),
         !(item %in% c("bribe_h20","sel_h20","perf1","perf2"))) %>% 
  mutate(agg_sector=factor(agg_sector))

saveRDS(to_idealstan, paste0(save_loc, "/model_data.rds"))

# load original data

to_make <- readRDS(paste0(save_loc, "/model_data_orig.rds"))

# to_make <- to_idealstan %>% 
#   id_make(group_id="wbcode",person_id="idstd",item_id = "item",
#           person_cov = ~agg_sector + exporter + ownership +
#             dir_timetax + dir_customsobst + dir_courtsobst + 
#             dir_taxrobst + dir_poliobst + dir_corrobst + firm_age +
#             man_exp + quality_cert + conglomerate + 
#           tsl + refusalrate + realGDPpc + v2x_polyarchy)

# use adjusted, informative priors to match earlier fit with different
# priors on the regular discrimination parameters

# wb_fit <- id_estimate(to_make,nchains = 3,ncores = parallel::detectCores(),
#                       fixtype = "prefix",vary_ideal_pts = "none",use_groups = T,
#                       restrict_sd_low= 9958,
#                       restrict_N_low = 42,
#                       restrict_N_high = 7310,person_sd = 1,
#                       restrict_sd_high = 2689,
#                       restrict_ind_high = "input_pc",
#                       restrict_ind_low= "input_assn",
#                       max_treedepth=12,
#                       const_type = "items",
#                       id_refresh = 100,niters = 250,warmup = 500)

wb_fit <- id_estimate(to_make,nchains = 3,ncores = parallel::detectCores(),
                      fixtype = "prefix",vary_ideal_pts = "none",use_groups = T,
                      grainsize=1,restrict_sd_high = .001,
                      restrict_ind_high = "input_pc",
                      restrict_ind_low= "input_assn",
                      seed=553718,
                      restrict_sd_low = 3,max_treedepth=12,
                      #save_files="~/wbsurvey",
                      fix_low=0,het_var=T,
                      const_type = "items",
                      id_refresh = 100,niters = 250,warmup = 500)

saveRDS(wb_fit,paste0( save_loc, "/wb_fit_",run,".rds"))

# save predicted scores for each value
# need to use original covariates + country intercepts



filter(wb_fit@summary,grepl(x=variable,pattern="sigma\\_reg\\_full")) %>% 
  mutate(variable=factor(variable,levels=unique(variable),
                         labels=levels(to_make@score_matrix$item_id))) %>% 
  ggplot(aes(y=mean,x=reorder(variable,mean))) +
  geom_pointrange(aes(ymin=lower,ymax=upper)) +
  geom_hline(yintercept = 0,linetype=2) +
  coord_flip() +
  ggthemes::theme_tufte() +
  labs(y="Discrimination",x="Item") +
  ggtitle("Observed Discrimination Parameters")

ggsave(paste0(save_loc, "items.png"))

# missing parameters

filter(wb_fit@summary,grepl(x=variable,pattern="sigma\\_abs\\_free")) %>% 
  mutate(variable=factor(variable,levels=unique(variable),
                         labels=levels(to_make@score_matrix$item_id))) %>% 
  ggplot(aes(y=mean,x=reorder(variable,mean))) +
  geom_pointrange(aes(ymin=lower,ymax=upper)) +
  geom_hline(yintercept = 0,linetype=2) +
  coord_flip() +
  ggthemes::theme_tufte() +
  labs(y="Discrimination",x="Item") +
  ggtitle("Missing Discrimination Parameters")

ggsave(paste0(save_loc, "items_missing.png"))
